Catherine has prepared data files in which sentences have been turned into fragments. As input I will use 60,000 fragments and 60,000 sentences, with each fragment derived from one of those sentences. In future runs the fragments will not be derived from the input sentences. The labels are 1 or 0, where 1 indicates a complete sentence and 0 indicates a fragment.
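Each line of the `detailedRemoval.txt` files appears to hold three ` ||| `-separated fields: the original sentence, a middle field describing the removal (ignored by the loader below), and the resulting fragment. A minimal sketch of the parse, using a made-up example line:
In [ ]:
# Hypothetical example line (not from the real data files); the loader
# below keeps fields 0 (sentence) and 2 (fragment) and skips field 1.
line = "She sailed at night. ||| sailed ||| she at night ."
asArray = line.split(" ||| ")
print(asArray[0].strip())  # original sentence -> label 1
print(asArray[2].strip())  # fragment -> label 0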
In [ ]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tflearn
from tflearn.data_utils import to_categorical
import spacy
nlp = spacy.load('en')
import re
from nltk.util import trigrams  # only trigrams is used below
import csv
In [ ]:
texts = []
labels = []
with open("../.removingPOS/updatedSentences/conjunctionSentences/detailedRemoval.txt","r") as f:
for line in f:
asArray = line.split(" ||| ")
fragment = asArray[2].strip()
fragment = re.sub("\ \.", ".", fragment)
fragment = re.sub("\,\.", ".", fragment)
texts.append(fragment.capitalize())
labels.append(0)
texts.append(asArray[0].strip())
labels.append(1)
with open("../.removingPOS/updatedSentences/nounSentences/detailedRemoval.txt","r") as f:
for line in f:
asArray = line.split(" ||| ")
fragment = asArray[2].strip()
fragment = re.sub("\ \.", ".", fragment)
fragment = re.sub("\,\.", ".", fragment)
texts.append(fragment.capitalize())
labels.append(0)
texts.append(asArray[0].strip())
labels.append(1)
with open("../.removingPOS/updatedSentences/nounverbSentences/detailedRemoval.txt","r") as f:
for line in f:
asArray = line.split(" ||| ")
fragment = asArray[2].strip()
fragment = re.sub("\ \.", ".", fragment)
fragment = re.sub("\,\.", ".", fragment)
texts.append(fragment.capitalize())
labels.append(0)
texts.append(asArray[0].strip())
labels.append(1)
with open("../.removingPOS/updatedSentences/verbSentences/detailedRemoval.txt","r") as f:
for line in f:
asArray = line.split(" ||| ")
fragment = asArray[2].strip()
fragment = re.sub("\ \.", ".", fragment)
fragment = re.sub("\,\.", ".", fragment)
texts.append(fragment.capitalize())
labels.append(0)
texts.append(asArray[0].strip())
labels.append(1)
print(texts[-10:])
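Since there should be 60,000 fragment/sentence pairs in total, a quick count check (my addition) confirms the load:
In [ ]:
# Expect 120,000 examples: one fragment and one sentence per pair.
print(len(texts), "texts /", len(labels), "labels")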
In [ ]:
import random
combined = list(zip(texts,labels))
random.shuffle(combined)
texts[:], labels[:] = zip(*combined)
print(texts[-10:])
print(labels[-10:])
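The shuffle above is unseeded, so the train/test split changes on every run. If reproducibility matters, seeding before the shuffle (my addition, with an arbitrary seed) fixes the split:
In [ ]:
# Optional: seed before shuffling so the split is reproducible across runs.
random.seed(42)
combined = list(zip(texts, labels))
random.shuffle(combined)
texts[:], labels[:] = zip(*combined)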
In [ ]:
def textStringToPOSArray(text):
    doc = nlp(text)
    tags = []
    for word in doc:
        tags.append(word.pos_)
    return tags

textStringToPOSArray(texts[3])
In [ ]:
def find_ngrams(input_list, n):
    # Generic n-gram helper (kept for reference; nltk's trigrams is used below).
    return zip(*[input_list[i:] for i in range(n)])

def getPOSTrigramsForTextString(text):
    tags = textStringToPOSArray(text)
    tgrams = list(trigrams(tags))
    return tgrams

print("Text: ", texts[3], labels[3])
getPOSTrigramsForTextString(texts[3])
In [ ]:
def trigramsToDictKeys(trigrams):
    keys = []
    for trigram in trigrams:
        keys.append('>'.join(trigram))
    return keys

print(texts[2])
print(trigramsToDictKeys(getPOSTrigramsForTextString(texts[2])))
In [ ]:
from collections import Counter
c = Counter()
for textString in texts:
    c.update(trigramsToDictKeys(getPOSTrigramsForTextString(textString)))
total_counts = c
print("Total unique POS trigrams in data set: ", len(total_counts))
In [ ]:
vocab = sorted(total_counts, key=total_counts.get, reverse=True)[:1200]
print(vocab[:60])
In [ ]:
print(vocab[-1], ': ', total_counts[vocab[-1]])
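Truncating the vocabulary to the 1,200 most frequent trigrams drops the rare tail. A quick coverage check (my addition) shows what fraction of all trigram occurrences the kept vocabulary still accounts for:
In [ ]:
# Fraction of all POS-trigram occurrences covered by the 1200-entry vocab.
total_occurrences = sum(total_counts.values())
covered = sum(total_counts[t] for t in vocab)
print("Coverage: {:.1%}".format(covered / total_occurrences))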
Take the trigrams and index them
In [ ]:
word2idx = {n: i for i, n in enumerate(vocab)}  # map each trigram to its index in the vocabulary
print(word2idx)
In [ ]:
def textToTrigrams(text):
    return trigramsToDictKeys(getPOSTrigramsForTextString(text))

def text_to_vector(text):
    wordVector = np.zeros(len(vocab))
    for word in textToTrigrams(text):
        index = word2idx.get(word, None)
        if index is not None:
            wordVector[index] += 1
    return wordVector
In [ ]:
text_to_vector('The tea is for a party to celebrate '
               'the movie so she has no time for a cake')[:65]
In [ ]:
word_vectors = np.zeros((len(texts), len(vocab)), dtype=np.int_)
for ii, text in enumerate(texts):
    word_vectors[ii] = text_to_vector(text)
In [ ]:
# Printing out the first 5 word vectors
word_vectors[:5, :23]
In [ ]:
records = len(labels)
train_fraction = 0.9  # 90% of the data for training, the remaining 10% for testing
split = int(records * train_fraction)
print(split)
trainX, trainY = word_vectors[:split], to_categorical(labels[:split], 2)
testX, testY = word_vectors[split:], to_categorical(labels[split:], 2)
In [ ]:
trainX[-1], trainY[-1]
In [ ]:
len(trainY), len(testY), len(trainY) + len(testY)
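Because fragments and sentences were appended in pairs before shuffling, both splits should be close to 50/50; a quick check (my addition):
In [ ]:
# Class balance: column 1 of the one-hot labels marks complete sentences.
print("Sentence share in train: {:.3f}".format(trainY[:, 1].mean()))
print("Sentence share in test:  {:.3f}".format(testY[:, 1].mean()))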
In [ ]:
# Network building
def build_model():
    # This resets all parameters and variables, leave this here
    tf.reset_default_graph()
    net = tflearn.input_data([None, len(vocab)])                 # Input: one unit per vocab trigram
    net = tflearn.fully_connected(net, 200, activation='ReLU')   # Hidden
    net = tflearn.fully_connected(net, 25, activation='ReLU')    # Hidden
    net = tflearn.fully_connected(net, 2, activation='softmax')  # Output: fragment vs. sentence
    net = tflearn.regression(net, optimizer='sgd', learning_rate=0.1, loss='categorical_crossentropy')
    model = tflearn.DNN(net)
    return model
In [ ]:
len(vocab)
In [ ]:
model = build_model()
In [ ]:
# Training
model.fit(trainX, trainY, validation_set=0.1, show_metric=True, batch_size=128, n_epoch=50)
In [ ]:
# Testing
# Column 0 of the softmax output is P(fragment); a prediction of 1 here means "fragment".
predictions = (np.array(model.predict(testX))[:,0] >= 0.5).astype(np.int_)
test_accuracy = np.mean(predictions == testY[:,0], axis=0)
print("Test accuracy: ", test_accuracy)
In [ ]:
def test_sentence(sentence):
    positive_prob = model.predict([text_to_vector(sentence)])[0][1]  # P(label 1): complete sentence
    print('Sentence: {}'.format(sentence))
    print('P(positive) = {:.3f} :'.format(positive_prob),
          'Positive' if positive_prob > 0.5 else 'Negative')
In [ ]:
test_sentence("Even though he had the better arguments and was by far the more powerful speaker.")
In [ ]:
test_sentence("Even though he had the better arguments and was by far the more powerful speaker, Peter lost the debate.")
In [ ]:
test_sentence("Working far into the night in an effort to salvage her little boat.")
In [ ]:
test_sentence("She was working far into the night in an effort to salvage her little boat.")
In [ ]:
test_sentence("The man eating pizza.")
In [ ]:
test_sentence("The man eating pizza is overwieght.")
In [ ]:
test_sentence("While we were swimming at the lake.")
In [ ]:
test_sentence("While we were swimming at the lake, we saw a fish.")
In [ ]:
test_sentence("Keep going.")
In [ ]:
test_sentence("A time of wonder and amazement")
In [ ]:
test_sentence("That was a time of wonder and amazement")
In [ ]:
test_sentence("Since she never saw that movie.")
In [ ]:
test_sentence("We should invite her, since she never saw that movie.")
In [ ]:
test_sentence("Affecting the lives of many students in New York City.")
In [ ]:
test_sentence("Quill is affecting the lives of many students in New York City.")
In [ ]:
test_sentence("Standing on the edge of the cliff looking down.")
In [ ]:
test_sentence("I'm standing on the edge of the cliff and looking down.")
In [ ]:
test_sentence("The team looked forward to victory.")
In [ ]:
model.save("./model.tfl")
Save the vocab
In [ ]:
with open("./vocabindex.csv", "w") as f:
    w = csv.writer(f)
    for key, val in word2idx.items():
        w.writerow([key, val])
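Reading the index back is the mirror of the write above; a sketch (my addition), assuming the two-column CSV layout just written:
In [ ]:
# Rebuild word2idx from vocabindex.csv; skip any blank rows the csv
# writer may have emitted.
word2idx_loaded = {}
with open("./vocabindex.csv", "r") as f:
    for row in csv.reader(f):
        if row:
            word2idx_loaded[row[0]] = int(row[1])
print(len(word2idx_loaded))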
In [ ]:
vocab